# Delete all variables
rm( list = ls() )
# Import libraries
library(ggplot2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(dplyr)
##
## Attache Paket: 'dplyr'
## Die folgenden Objekte sind maskiert von 'package:stats':
##
## filter, lag
## Die folgenden Objekte sind maskiert von 'package:base':
##
## intersect, setdiff, setequal, union
library(plotly)
##
## Attache Paket: 'plotly'
## Das folgende Objekt ist maskiert 'package:ggplot2':
##
## last_plot
## Das folgende Objekt ist maskiert 'package:stats':
##
## filter
## Das folgende Objekt ist maskiert 'package:graphics':
##
## layout
library(mosaic)
## Registered S3 method overwritten by 'mosaic':
## method from
## fortify.SpatialPolygonsDataFrame ggplot2
##
## The 'mosaic' package masks several functions from core packages in order to add
## additional features. The original behavior of these functions should not be affected by this.
##
## Attache Paket: 'mosaic'
## Das folgende Objekt ist maskiert 'package:Matrix':
##
## mean
## Das folgende Objekt ist maskiert 'package:plotly':
##
## do
## Die folgenden Objekte sind maskiert von 'package:dplyr':
##
## count, do, tally
## Das folgende Objekt ist maskiert 'package:ggplot2':
##
## stat
## Die folgenden Objekte sind maskiert von 'package:stats':
##
## binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
## quantile, sd, t.test, var
## Die folgenden Objekte sind maskiert von 'package:base':
##
## max, mean, min, prod, range, sample, sum
library(isotree)
library(party)
## Lade nötiges Paket: grid
## Lade nötiges Paket: mvtnorm
## Lade nötiges Paket: modeltools
## Lade nötiges Paket: stats4
## Lade nötiges Paket: strucchange
## Lade nötiges Paket: zoo
##
## Attache Paket: 'zoo'
## Die folgenden Objekte sind maskiert von 'package:base':
##
## as.Date, as.Date.numeric
## Lade nötiges Paket: sandwich
library(rpart)
library(caret)
##
## Attache Paket: 'caret'
## Das folgende Objekt ist maskiert 'package:mosaic':
##
## dotPlot
# Used to scale time
library(lubridate)
## Lade nötiges Paket: timechange
##
## Attache Paket: 'lubridate'
## Die folgenden Objekte sind maskiert von 'package:base':
##
## date, intersect, setdiff, union
library(scales)
##
## Attache Paket: 'scales'
## Das folgende Objekt ist maskiert 'package:mosaic':
##
## rescale
library(hms)
##
## Attache Paket: 'hms'
## Das folgende Objekt ist maskiert 'package:lubridate':
##
## hms
# Load the recorded sensor data.
# read.csv() already returns a data.frame, so the extra data.frame() wrapper
# in the original was redundant; also use `<-` for assignment.
read_data <- read.csv("Idle_Run_Tobias_Egger.csv")
motion_data <- read_data
There are 2893 observations
This value is available under the column n
inspect(motion_data)
##
## categorical variables:
## name class levels n missing
## 1 Creator character 1 2893 0
## 2 Activity character 2 2893 0
## distribution
## 1 Tobias Egger (100%)
## 2 Run (50.1%), Idle (49.9%)
##
## quantitative variables:
## name class min Q1 median Q3
## 1 ID integer 1.000000e+00 7.240000e+02 1.447000e+03 2.170000e+03
## 2 Sample integer 1.000000e+00 1.300000e+01 2.500000e+01 3.800000e+01
## 3 timestamp numeric 1.672264e+12 1.672265e+12 1.672265e+12 1.672332e+12
## 4 X numeric -2.091795e+01 -1.339564e+00 -2.709400e-02 1.158740e+00
## 5 Y numeric -1.174263e+01 1.241818e+00 3.385210e+00 5.582491e+00
## 6 Z numeric -1.014845e+01 6.115233e+00 8.706345e+00 1.025009e+01
## max mean sd n missing
## 1 2.893000e+03 1.447000e+03 8.352815e+02 2893 0
## 2 5.000000e+01 2.555617e+01 1.438819e+01 2893 0
## 3 1.672332e+12 1.672298e+12 3.357830e+07 2893 0
## 4 1.741553e+01 3.877723e-03 3.360706e+00 2893 0
## 5 1.808569e+01 3.400054e+00 3.745857e+00 2893 0
## 6 4.263883e+01 8.313343e+00 5.274758e+00 2893 0
# Quick sanity checks: first rows, last rows, and a base-R column summary.
head(motion_data)
tail(motion_data)
summary(motion_data)
## ID Creator Sample Activity
## Min. : 1 Length:2893 Min. : 1.00 Length:2893
## 1st Qu.: 724 Class :character 1st Qu.:13.00 Class :character
## Median :1447 Mode :character Median :25.00 Mode :character
## Mean :1447 Mean :25.56
## 3rd Qu.:2170 3rd Qu.:38.00
## Max. :2893 Max. :50.00
## timestamp X Y Z
## Min. :1.672e+12 Min. :-20.917951 Min. :-11.743 Min. :-10.148
## 1st Qu.:1.672e+12 1st Qu.: -1.339564 1st Qu.: 1.242 1st Qu.: 6.115
## Median :1.672e+12 Median : -0.027094 Median : 3.385 Median : 8.706
## Mean :1.672e+12 Mean : 0.003878 Mean : 3.400 Mean : 8.313
## 3rd Qu.:1.672e+12 3rd Qu.: 1.158740 3rd Qu.: 5.582 3rd Qu.: 10.250
## Max. :1.672e+12 Max. : 17.415534 Max. : 18.086 Max. : 42.639
names(motion_data)
## [1] "ID" "Creator" "Sample" "Activity" "timestamp" "X"
## [7] "Y" "Z"
For Tobias Egger, activity Idle, there are 1445 observations
For Tobias Egger, activity Run, there are 1448 observations
# Bar chart of observations per activity, faceted by creator.
# FIX: piping through group_by() before ggplot() has no effect on the plot,
# so the grouping step is dropped.
ggplot(motion_data, aes(x = Activity, fill = Activity)) +
  geom_bar(color = c('light blue', 'dark orange')) +
  facet_grid(. ~ Creator) +
  scale_fill_manual(values = c('light blue', 'dark orange')) +
  scale_colour_manual(values = c('light blue', 'dark orange'))
# Number of observations per activity.
activity_count <- motion_data %>%
  group_by(Activity) %>%
  summarize(count = n())
activity_count
Use activities from Tobias Egger
motion_data_te <- subset(motion_data, Creator == "Tobias Egger")
Extract hours from timestamp
Link: https://www.appsloveworld.com/r/100/298/density-plot-based-on-time-of-the-day
Link: https://stackoverflow.com/questions/13456241/convert-unix-epoch-to-date-object
density_data <- data.frame(motion_data_te)
# The raw timestamp is milliseconds since the Unix epoch; convert to POSIXct.
density_data$timestamp <- as.POSIXct(density_data$timestamp / 1000, origin = "1970-01-01")
# Extract time of day (hour/minute/second) from the timestamp.
density_data$time <- hms::hms(second(density_data$timestamp),
                              minute(density_data$timestamp),
                              hour(density_data$timestamp))
# Convert back to POSIXct since ggplot does not work with class hms.
density_data$time <- as.POSIXct(density_data$time)
# FIX: timestamp is already POSIXct at this point, so the extra
# as.POSIXct(..., origin = ...) wrapper was redundant.
density_data$date <- as.Date(density_data$timestamp)
density_data
Plot timestamp density
# Density of recording times, one curve per activity.
# FIX 1: `..density..` was deprecated in ggplot2 3.4.0 -> after_stat(density).
# FIX 2: the label format used "%H:%m" (month!) instead of "%H:%M" (minute).
ggplot(density_data, aes(x = time, fill = Activity)) +
  geom_histogram(bins = sqrt(length(density_data$Activity)),
                 fill = "white", color = "black",
                 aes(y = after_stat(density))) +
  geom_density(alpha = .3) +
  scale_x_datetime(breaks = date_breaks("1 hours"),
                   labels = date_format("%H:%M"), expand = c(0, 0))
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
Density for X axis
# Histogram + density of the X-axis acceleration, colored by activity.
# FIX: `..density..` was deprecated in ggplot2 3.4.0 -> after_stat(density).
ggplot(motion_data_te, aes(x = X, fill = Activity)) +
  geom_histogram(bins = sqrt(length(motion_data_te$Activity)),
                 fill = "white", color = "black",
                 aes(y = after_stat(density))) +
  geom_density(alpha = .3)
Density for Y axis
# Histogram + density of the Y-axis acceleration, colored by activity.
# FIX: `..density..` was deprecated in ggplot2 3.4.0 -> after_stat(density).
ggplot(motion_data_te, aes(x = Y, fill = Activity)) +
  geom_histogram(bins = sqrt(length(motion_data_te$Activity)),
                 fill = "white", color = "black",
                 aes(y = after_stat(density))) +
  geom_density(alpha = .3)
Density for Z axis
# Histogram + density of the Z-axis acceleration, colored by activity.
# FIX: `..density..` was deprecated in ggplot2 3.4.0 -> after_stat(density).
ggplot(motion_data_te, aes(x = Z, fill = Activity)) +
  geom_histogram(bins = sqrt(length(motion_data_te$Activity)),
                 fill = "white", color = "black",
                 aes(y = after_stat(density))) +
  geom_density(alpha = .3)
# Raw acceleration over time, one plot per axis.
ggplot(motion_data_te, aes(x = timestamp, y = X)) +
  geom_line()
ggplot(motion_data_te, aes(x = timestamp, y = Y)) +
  geom_line()
ggplot(motion_data_te, aes(x = timestamp, y = Z)) +
  geom_line()
# Acceleration traces for the "Idle" activity, all three axes overlaid.
idle_activity <- motion_data_te[motion_data_te$Activity == "Idle", ]
idle_plot <- ggplot(idle_activity, aes(x = timestamp)) +
  labs(x = "Timestamp", y = "Acceleration") +
  geom_line(aes(y = X), color = "dark green", alpha = 0.8) +
  geom_line(aes(y = Y), color = "light blue", alpha = 0.8) +
  geom_line(aes(y = Z), color = "dark orange", alpha = 0.8)
# Acceleration traces for the "Run" activity, all three axes overlaid.
run_activity <- motion_data_te[motion_data_te$Activity == "Run", ]
run_plot <- ggplot(run_activity, aes(x = timestamp)) +
  labs(x = "Timestamp", y = "Acceleration") +
  geom_line(aes(y = X), color = "dark green", alpha = 0.8) +
  geom_line(aes(y = Y), color = "light blue", alpha = 0.8) +
  geom_line(aes(y = Z), color = "dark orange", alpha = 0.8)
Dark green: X
Light blue: Y
Dark orange: Z
ggplotly(idle_plot)
Dark green: X
Light blue: Y
Dark orange: Z
ggplotly(run_plot)
In the first three plots we can already distinguish between idle and run.
Third plot:
dark green: X
light blue: Y
dark orange: Z
# Base-R scatter overview of the three acceleration axes.
plot_data <- motion_data_te[, c("X", "Y", "Z")]
plot(motion_data_te$X)
plot(motion_data_te$Y)
plot(motion_data_te$Z)
plot(plot_data, col = c('dark green', 'light blue', 'dark orange'))
Light blue: Activity running
Dark orange: Activity idle
We can also see here that timestamp and Sample correlate best, with 0.869.
For the axes, Z and Y correlate best, with 0.079.
# Pair plot of the quantitative variables, colored by activity.
# FIX 1: color was mapped via motion_data_te$Activity (a cross-data-frame
# reference); include Activity in the plotted data and map it by name.
# FIX 2: the title was copy-pasted from an iris example ("Species ...").
pair_data <- subset(motion_data_te, select = c(Sample, timestamp, X, Y, Z, Activity))
ggpairs(data = pair_data, columns = 1:5, aes(color = Activity),
        title = "Motion-data pair plot with quantitative variables") +
  scale_fill_manual(values = c('light blue', 'dark orange')) +
  scale_colour_manual(values = c('light blue', 'dark orange'))
I created a custom method to find the outliers and plotted the individual IDs below
For timestamp, X there are 196 outliers
# Flag values lying outside the Tukey fences (1.5 * IQR beyond the quartiles).
#
# @param x Numeric vector.
# @return Logical vector the same length as x; TRUE where x is an outlier.
#   NA inputs yield NA flags instead of aborting (FIX: the original
#   quantile()/IQR() calls errored on any NA in x).
findoutlier <- function(x) {
  lower <- quantile(x, .25, na.rm = TRUE) - 1.5 * IQR(x, na.rm = TRUE)
  upper <- quantile(x, .75, na.rm = TRUE) + 1.5 * IQR(x, na.rm = TRUE)
  x < lower | x > upper
}
# Mark rows whose timestamp or X value is an outlier within its activity
# group; the row's ID is kept for outliers, NA otherwise.
data_out <-
  group_by(motion_data_te, Activity) %>%
  mutate(outliers = ifelse(findoutlier(timestamp) | findoutlier(X), ID, NA))
# FIX: the original `filter(outliers != "")` only worked because filter()
# drops NA comparison results; test for NA explicitly instead.
indexes <- data_out %>%
  filter(!is.na(outliers))
indexes
For timestamp, Y there are 103 outliers
# Mark timestamp/Y outliers per activity. Note: this intentionally overwrites
# motion_data_te, adding an `outliers` column (ID for outliers, NA otherwise).
motion_data_te <-
  group_by(motion_data_te, Activity) %>%
  mutate(outliers = ifelse(findoutlier(timestamp) | findoutlier(Y), ID, NA))
# FIX: use an explicit NA test instead of relying on filter() silently
# dropping the NA results of `outliers != ""`.
motion_data_te %>%
  filter(!is.na(outliers))
motion_data_te
motion_data_te %>%
  filter(!is.na(outliers))
Boxplot timestamp, y for activity idle and run
# Boxplot of Y over timestamp, one panel per activity.
motion_data_te %>%
  group_by(Activity) %>%
  ggplot(aes(x = timestamp, y = Y, fill = Activity)) +
  geom_boxplot(color = "black") +
  facet_grid(Creator ~ Activity) +
  geom_text(aes(label = ""), na.rm = TRUE, hjust = 1) +
  scale_fill_manual(values = c('light blue', 'dark orange')) +
  theme(axis.text.x = element_text(angle = 90))
For timestamp, Z there are 290 outliers
# Mark timestamp/Z outliers per activity (overwrites the `outliers` column).
motion_data_te <-
  group_by(motion_data_te, Activity) %>%
  mutate(outliers = ifelse(findoutlier(timestamp) | findoutlier(Z), ID, NA))
# FIX: explicit NA test instead of relying on filter() dropping the NA
# results of `outliers != ""`.
motion_data_te %>%
  filter(!is.na(outliers))
Boxplot timestamp, z for activity idle and run
# Boxplot of Z over timestamp, one panel per activity.
motion_data_te %>%
  group_by(Activity) %>%
  ggplot(aes(x = timestamp, y = Z, fill = Activity)) +
  geom_boxplot(color = "black") +
  facet_grid(Creator ~ Activity) +
  geom_text(aes(label = ""), na.rm = TRUE, hjust = 1) +
  scale_fill_manual(values = c('light blue', 'dark orange')) +
  theme(axis.text.x = element_text(angle = 90))
In total there are 589 outliers.
Boxplot timestamp, x for activity idle and run
# Boxplot of X over timestamp, one panel per activity.
motion_data_te %>%
  group_by(Activity) %>%
  ggplot(aes(x = timestamp, y = X, fill = Activity)) +
  geom_boxplot() +
  facet_grid(Creator ~ Activity) +
  scale_fill_manual(values = c('light blue', 'dark orange')) +
  theme(axis.text.x = element_text(angle = 90))
Without outliers -> Didn’t work:
#data <- subset(motion_data_te, Activity == "Idle")
# get the values of the outliers
# Values flagged by boxplot's Tukey rule for the X axis.
outliers_X <- boxplot(motion_data_te$X, plot = FALSE)$out
# FIX: match() returns only the FIRST position of each value, so repeated
# outlier values all mapped to one row; which(... %in% ...) returns every
# matching row instead.
index_out <- which(motion_data_te$X %in% outliers_X)
Gather y axis outliers
# get the values of the outliers
# Values flagged by boxplot's Tukey rule for the Y axis.
outliers_Y <- boxplot(motion_data_te$Y, plot = FALSE)$out
# FIX: which(... %in% ...) finds every matching row; match() would only
# return the first occurrence of each duplicated outlier value.
index_out <- c(index_out, which(motion_data_te$Y %in% outliers_Y))
Gather z axis outliers
# get the values of the outliers
# Values flagged by boxplot's Tukey rule for the Z axis.
outliers_Z <- boxplot(motion_data_te$Z, plot = FALSE)$out
# FIX: which(... %in% ...) finds every matching row; match() would only
# return the first occurrence of each duplicated outlier value.
index_out <- c(index_out, which(motion_data_te$Z %in% outliers_Z))
Print outliers
# show the positions of the outliers in X, Y, Z
index_out
## [1] 8 9 11 14 16 18 21 22 23 24 25 26 29 31 32
## [16] 33 36 44 50 51 57 67 69 70 71 73 75 76 77 81
## [31] 82 89 94 97 99 100 102 107 114 123 126 127 130 132 134
## [46] 135 142 149 150 175 179 180 181 183 186 188 189 191 192 193
## [61] 194 195 196 197 199 200 202 234 237 239 241 242 244 245 247
## [76] 249 250 252 253 255 256 257 258 282 296 299 301 302 305 308
## [91] 309 316 317 320 322 341 347 349 350 352 353 354 356 358 359
## [106] 360 362 364 366 368 369 373 374 380 404 405 406 407 409 410
## [121] 411 414 415 416 417 418 419 420 421 425 426 429 432 433 458
## [136] 459 460 469 518 525 533 548 549 550 573 574 580 644 749 756
## [151] 757 759 761 764 765 767 768 776 777 778 779 784 785 788 807
## [166] 811 812 814 815 817 818 819 822 823 824 825 826 827 830 832
## [181] 833 834 836 837 838 839 840 841 842 843 845 846 847 892 894
## [196] 916 919 923 924 925 928 929 930 931 932 935 941 943 945 946
## [211] 947 951 952 953 959 961 962 963 967 974 998 999 1001 1002 1003
## [226] 1005 1007 1008 1009 1012 1014 1015 1016 1019 1022 1024 1026 1029 1055 1057
## [241] 1058 1059 1060 1061 1065 1106 1113 1114 1118 1122 1123 1124 1125 1126 1128
## [256] 1129 1130 1131 1132 1133 1136 1148 1161 1163 1165 1170 1171 1172 1174 1175
## [271] 1177 1178 1179 1182 1184 1185 1186 1188 1190 1193 1194 1197 1224 1232 1236
## [286] 1238 1239 1240 1249 1284 1285 1291 1292 1298 1337 1338 1340 1341 1342 1343
## [301] 1345 1349 1350 1351 1352 1353 1354 1357 1358 1362 1364 1365 1366 1389 1390
## [316] 1392 1393 1396 1398 1399 1400 1403 1413 1415 1420 1422 1423 1428 1429 1762
## [331] 1766 2215 2230 2231 2232 2241 2242 2243 2244 2245 2246 2247 2248 2249 2448
## [346] 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525
## [361] 2526 2527 2807 2808 6 9 16 23 27 30 68 71 79 94 128
## [376] 131 150 183 187 202 258 295 298 321 357 363 430 571 572 629
## [391] 781 793 841 956 1006 1027 1048 1051 1056 1080 1118 1123 1160 1167 1175
## [406] 1183 1228 1338 1349 1423 1426 1430 1638 1639 1640 1641 1642 1643 1644 1645
## [421] 1646 1647 1648 1649 1650 1651 1652 2038 2039 2040 2157 2158 2159 2160 2161
## [436] 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176
## [451] 2177 2178 2179 2180 2181 2182 2183 2184 2619 2620 2621 2622 2623 16 23
## [466] 41 52 70 81 85 94 98 100 102 106 124 125 126 132 133
## [481] 135 139 147 154 162 170 181 183 191 196 198 209 213 238 239
## [496] 242 246 248 250 252 254 256 262 266 268 270 274 276 278 296
## [511] 298 302 306 308 310 313 314 315 325 329 333 334 347 348 181
## [526] 350 354 355 356 357 361 363 365 367 374 378 379 390 394 408
## [541] 410 416 417 419 421 423 425 429 438 456 461 478 481 485 489
## [556] 542 546 558 638 749 752 760 763 766 770 777 809 810 815 816
## [571] 818 820 823 827 831 833 838 840 844 847 895 926 933 937 941
## [586] 944 964 999 1006 1007 1008 1010 1018 1020 1031 1048 1051 1059 1066 1076
## [601] 1106 1111 1118 1119 1127 1130 1132 1134 1150 1171 1175 1179 1183 1184 1187
## [616] 1222 1247 1250 1251 1253 1303 1338 1340 1343 1345 1347 1349 1351 1355 1357
## [631] 1363 1365 1367 1371 1375 1379 1383 1394 1404 1411 1419 1431 1663 1664 1665
## [646] 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1906
## [661] 1907 1908 1909 1910 1911 1912 1952 1953 1954 1955 1956 2056 2057 2058 2059
## [676] 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074
## [691] 2075 2076 2077 2078 2079 2080 2081 2082 2083 2165 2166 2167 2168 2169 2170
## [706] 2171 2172 2173 2174 2175 2176 2177 2178 2502 2503 2504 2505 2506 2507 2508
## [721] 2509 2510 2511 2512 2513 2601 2657 2658 2659 2660 2661 2662 2663 2821 2822
## [736] 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2875 2876 2877 2878
## [751] 2879 2880 2881 2882 2883 2884 2885 2886 2887
# Drop the rows collected in index_out and re-draw the per-axis boxplots to
# confirm the spread without them.
data_no_out <- motion_data_te[-index_out, ]
data_no_out
boxplot(data_no_out$X, plot = TRUE)
boxplot(data_no_out$Y, plot = TRUE)
boxplot(data_no_out$Z, plot = TRUE)
Delete column ID
# Reduce to the classification-relevant columns: Activity plus the three axes.
data_clf <- data.frame(motion_data_te)
drop_cols <- c("ID", "outliers", "Creator", "Sample", "timestamp")
data_clf <- data_clf[, !(names(data_clf) %in% drop_cols)]
names(data_clf)
## [1] "Activity" "X" "Y" "Z"
Encoding of categorical data -> Creator deleted due to NaN after scaling
#data_clf$Creator <- as.numeric(factor(data_clf$Creator))
##data_clf$Activity <- as.numeric(factor(data_clf$Activity))
head(data_clf)
Scale data -> For classification I got the same result with, without scaling
# Standardize the three axis columns (classification results were identical
# with and without scaling).
data_clf_scaled <- data.frame(data_clf)
for (axis in c("X", "Y", "Z")) {
  data_clf_scaled[[axis]] <- scale(data_clf_scaled[[axis]])
}
head(data_clf_scaled)
Train: 80 % Test: 20 %
Used later for classification
# Reproducible 80/20 train/test split, stratified by Activity.
set.seed(7)
train_index <- createDataPartition(data_clf$Activity, p = 0.80, list = FALSE)
test_data <- data_clf[-train_index, ]    # 20 % held out for testing
train_data <- data_clf[train_index, ]    # 80 % for training/validation
Used only for isolation_trees
# Separate 80/20 split reserved for the isolation forest (same seed, so it
# yields the same rows as the classification split).
set.seed(7)
train_index_iso <- createDataPartition(data_clf$Activity, p = 0.80, list = FALSE)
test_data_iso <- data_clf[-train_index_iso, ]    # 20 % held out
train_data_iso <- data_clf[train_index_iso, ]    # 80 % for training
Used to visualize the ID in plotly
# Split used only to look up ID/timestamp/Creator when visualizing in plotly.
set.seed(7)
train_index_id <- createDataPartition(motion_data_te$Activity, p = 0.80, list = FALSE)
# FIX: the partition indices come from motion_data_te, so they must be applied
# to motion_data_te as well; the original indexed motion_data, which only
# coincides because this file contains a single creator.
test_data_id <- motion_data_te[-train_index_id, ]
train_data_id <- motion_data_te[train_index_id, ]
ntrees - Number of trees. Defaults to 50.
# Fit an isolation forest for outlier detection (10 trees, single-variable
# splits, single-threaded for reproducibility).
# CONSISTENCY FIX: train on train_data_iso, the split reserved for the
# isolation forest (identical rows to train_data because both splits use
# set.seed(7), but the intent is clearer).
isotree <- isolation.forest(train_data_iso, ndim = 1, ntrees = 10, nthreads = 1)
isotree
## Isolation Forest model
## Consisting of 10 trees
## Numeric columns: 3
## Categorical columns: 1
## Size: 494.71 KiB
Predict average train depth of the isolation forest:
# Average isolation depth of each training observation.
avg_depth_train <- predict(isotree, train_data_iso, type = "avg_depth")
# FIX: the original called par(train_data_iso), which passed the data columns
# as (invalid) graphics parameters and only produced warnings; removed.
plot(train_data_iso, avg_depth_train, col = "darkred",
     main = "Average isolation depth Train Data")
As we can see here, the average depth for the train data is at 18.29
summary(avg_depth_train)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.20 16.85 19.16 18.29 20.30 22.33
Predict average test depth of the isolation forest:
# Average isolation depth of each test observation.
avg_depth_test <- predict(isotree, test_data_iso, type = "avg_depth")
# FIX: the original called par(mar = test_data_iso), which passed data columns
# as (invalid) graphics parameters and only produced warnings; removed.
plot(test_data_iso, avg_depth_test, col = "darkred",
     main = "Average isolation depth Test Data")
As we can see here, the average depth for the test data is at 18.187
summary(avg_depth_test)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7.617 16.710 19.188 18.187 20.313 22.085
The average anomaly score for the train data is at 0.4247 or 42.47 %
Link: https://www.kaggle.com/code/norealityshows/outlier-detection-with-isolation-forest-in-r
# Anomaly score per training observation (scores close to 1 indicate likely
# anomalies; scores below 0.5 indicate regular points).
anomaly_score_train <- predict(isotree, train_data_iso, type = "score")
summary(anomaly_score_train)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.3477 0.3828 0.4040 0.4247 0.4506 0.7819
Based on the anomaly score, we can find the outliers and plot the result:
Here the threshold is set to 0.5
If the value is close to 1 the data point is likely an anomaly
if the value is smaller than 0.5, then the data point is likely to be a regular point
#predict outliers within dataset
# Attach the anomaly score and label every point at or above the 0.5
# threshold as an outlier.
train_data_iso$pred <- anomaly_score_train
above_threshold <- train_data_iso$pred >= 0.5
train_data_iso$outlier <- as.factor(ifelse(above_threshold, "outlier", "normal"))
Train data with and without outliers
# Inspect the labeled training data (first and last rows).
head(train_data_iso)
tail(train_data_iso)
Ggplotly with train data anomalies:
# Interactive (plotly) scatter plots of the training data, one per axis,
# coloring each point by its isolation-forest anomaly label.
# NOTE(review): the aesthetics reference columns of train_data_id (timestamp,
# ID, Creator) while the plotted data is train_data_iso — this only lines up
# because both splits were created with set.seed(7) and therefore contain the
# same rows in the same order; confirm before changing either split.
Anomaly <- train_data_iso$outlier
train_anomalies_x <- group_by(train_data_iso, Activity) %>%
ggplot(aes(x = train_data_id$timestamp, y = X, color = Anomaly, text = paste("ID:", train_data_id$ID))) +
geom_point(shape = 1, alpha = 0.8) +
labs( x = "Timestamp", y = "X Axis") +
labs(alpha = "", colour="Legend") +
facet_grid(train_data_id$Creator ~ Activity) +
scale_color_manual(values=c("#42f548", "#f54842")) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90))
train_anomalies_y <- group_by(train_data_iso, Activity) %>%
ggplot(aes(x = train_data_id$timestamp, y = Y, color = Anomaly, text = paste("ID:", train_data_id$ID))) +
geom_point(shape = 1, alpha = 0.8) +
labs( x = "Timestamp", y = "Y Axis") +
labs(alpha = "", colour="Legend") +
facet_grid(train_data_id$Creator ~ Activity) +
scale_color_manual(values=c("#42f548", "#f54842")) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90))
train_anomalies_z <- group_by(train_data_iso, Activity) %>%
ggplot(aes(x = train_data_id$timestamp, y = Z, color = Anomaly, text = paste("ID:", train_data_id$ID))) +
geom_point(shape = 1, alpha = 0.8) +
labs( x = "Timestamp", y = "Z Axis") +
labs(alpha = "", colour="Legend") +
facet_grid(train_data_id$Creator ~ Activity) +
scale_color_manual(values=c("#42f548", "#f54842")) +
theme_bw()+
theme(axis.text.x = element_text(angle = 90))
# Render the three plots interactively.
ggplotly(train_anomalies_x)
ggplotly(train_anomalies_y)
ggplotly(train_anomalies_z)
As we can see here, there are in total 385 outliers across both activities (threshold = 0.5):
Activity idle has 191 outliers and 965 normal data points
Activity run has 194 outliers and 965 normal data points
# Count normal vs. outlier points per activity on the training data.
# FIX: pass .groups = "drop" explicitly instead of letting summarise() emit
# its "has grouped output" message.
train_data_anomalies <- data.frame(train_data_iso)
group_by(train_data_anomalies, Activity, outlier) %>% summarize(
  count = n(),
  .groups = "drop"
)
## `summarise()` has grouped output by 'Activity'. You can override using the
## `.groups` argument.
Further we can also print out the outlier data rows:
# Rows classified as outliers / normal on the training data.
# FIX: filter on the data frame's own `outlier` column instead of the
# parallel train_data_anomalies copy — same values today, but the cross-frame
# reference breaks as soon as either frame is reordered or filtered.
train_data_outliers <- train_data_iso %>% filter(outlier == "outlier")
train_data_outliers
train_data_normal <- train_data_iso %>% filter(outlier == "normal")
train_data_normal
The average anomaly score for the test data is at 0.4272 or 42.72 %
Link: https://www.kaggle.com/code/norealityshows/outlier-detection-with-isolation-forest-in-r
# Anomaly score per test observation (close to 1 = likely anomaly).
anomaly_score_test <- predict(isotree, test_data_iso, type = "score")
summary(anomaly_score_test)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.3517 0.3825 0.4034 0.4272 0.4535 0.6974
Based on the anomaly, we can find the outliers and plot the result:
#predict outliers within dataset
# Attach the anomaly score and label every test point at or above the 0.5
# threshold as an outlier.
test_data_iso$pred <- anomaly_score_test
above_threshold_test <- test_data_iso$pred >= 0.50
test_data_iso$outlier <- as.factor(ifelse(above_threshold_test, "outlier", "normal"))
Test data with and without outliers
# Inspect the labeled test data (first and last rows).
head(test_data_iso)
tail(test_data_iso)
Ggplotly with test data anomalies:
# Interactive (plotly) scatter plots of the test data, one per axis,
# coloring each point by its isolation-forest anomaly label.
# NOTE(review): the aesthetics reference columns of test_data_id (timestamp,
# ID, Creator) while the plotted data is test_data_iso — this only lines up
# because both splits share set.seed(7) and hence the same row order;
# confirm before changing either split.
Anomaly <- test_data_iso$outlier
test_anomalies_x <- group_by(test_data_iso, Activity) %>%
ggplot(aes(x = test_data_id$timestamp, y = X, color = Anomaly, text = paste("ID:", test_data_id$ID))) +
geom_point(shape = 1, alpha = 0.8) +
labs( x = "Timestamp", y = "X Axis") +
labs(alpha = "", colour="Legend") +
facet_grid(test_data_id$Creator ~ Activity) +
scale_color_manual(values=c("#42f548", "#f54842")) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90))
test_anomalies_y <- group_by(test_data_iso, Activity) %>%
ggplot(aes(x = test_data_id$timestamp, y = Y, color = Anomaly, text = paste("ID:", test_data_id$ID))) +
geom_point(shape = 1, alpha = 0.8) +
labs( x = "Timestamp", y = "Y Axis") +
labs(alpha = "", colour="Legend") +
facet_grid(test_data_id$Creator ~ Activity) +
scale_color_manual(values=c("#42f548", "#f54842")) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90))
test_anomalies_z <- group_by(test_data_iso, Activity) %>%
ggplot(aes(x = test_data_id$timestamp, y = Z, color = Anomaly, text = paste("ID:", test_data_id$ID))) +
geom_point(shape = 1, alpha = 0.8) +
labs( x = "Timestamp", y = "Z Axis") +
labs(alpha = "", colour="Legend") +
facet_grid(test_data_id$Creator ~ Activity) +
scale_color_manual(values=c("#42f548", "#f54842")) +
theme_bw()+
theme(axis.text.x = element_text(angle = 90))
# Render the three plots interactively.
ggplotly(test_anomalies_x)
ggplotly(test_anomalies_y)
ggplotly(test_anomalies_z)
As we can see here, there are in total 107 outliers across both activities (threshold = 0.5):
Activity idle has 63 outliers and 226 normal data points
Activity run has 44 outliers and 245 normal data points
# Count normal vs. outlier points per activity on the test data.
# FIX: pass .groups = "drop" explicitly instead of letting summarise() emit
# its "has grouped output" message.
test_data_anomalies <- data.frame(test_data_iso)
group_by(test_data_anomalies, Activity, outlier) %>% summarize(
  count = n(),
  .groups = "drop"
)
## `summarise()` has grouped output by 'Activity'. You can override using the
## `.groups` argument.
Further we can also print out the outlier data rows:
# Rows classified as outliers / normal on the test data.
# FIX: filter on the data frame's own `outlier` column instead of the
# parallel test_data_anomalies copy — same values today, but the cross-frame
# reference breaks as soon as either frame is reordered or filtered.
test_data_outliers <- test_data_iso %>% filter(outlier == "outlier")
test_data_outliers
test_data_normal <- test_data_iso %>% filter(outlier == "normal")
test_data_normal
So in total the isolation tree detected 385 outliers for the train data and 107 outliers for the test data.
Compared to the boxplot analysis there are in total 492 outliers.
We can also save the outliers from the sensor data for further tests.
# Score the full classification data set and persist the non-outlier rows
# for further tests.
anomalies <- data.frame(data_clf)
anomaly_score <- predict(isotree, newdata = anomalies, type = "score")
summary(anomaly_score)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
##  0.3477  0.3827  0.4039  0.4252  0.4516  0.7819
# Label points at or above the 0.5 threshold as outliers.
anomalies$pred <- anomaly_score
anomalies$outlier <- as.factor(ifelse(anomalies$pred >= 0.50, "outlier", "normal"))
# FIX: subset data_clf with an explicit logical index instead of referencing
# the parallel `anomalies` frame inside filter(); same rows, clearer intent.
outliers <- data_clf[anomalies$outlier == "outlier", ]
outliers
no_outliers <- data_clf[anomalies$outlier == "normal", ]
no_outliers
write.csv(no_outliers, "Idle_Run_Tobias_Egger_no_outliers.csv", row.names = FALSE)
Accuracy was at first at 100 % because sample and timestamp was not removed
# CART model evaluated with 10-fold cross-validation.
set.seed(10)
control_par <- trainControl(method = "cv", number = 10)
model_cv <- train(
  Activity ~ .,
  data = train_data,
  method = "rpart",
  trControl = control_par,
  metric = "Accuracy"
)
model_cv
## CART
##
## 2315 samples
## 3 predictor
## 2 classes: 'Idle', 'Run'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 2083, 2083, 2083, 2083, 2084, 2085, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.0449827 0.7217987 0.44367984
## 0.1730104 0.6613192 0.32285551
## 0.2621107 0.5365730 0.07240014
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.0449827.
model_cv$finalModel
## n= 2315
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 2315 1156 Run (0.4993521 0.5006479)
## 2) X>=-2.041005 1887 792 Idle (0.5802862 0.4197138)
## 4) X< 1.579366 1399 448 Idle (0.6797713 0.3202287) *
## 5) X>=1.579366 488 144 Run (0.2950820 0.7049180) *
## 3) X< -2.041005 428 61 Run (0.1425234 0.8574766) *
# Train a Classification and Regression Tree (CART) using a single
# bootstrap resample.
set.seed(34437) # previously 34056
control_par <- trainControl(method = "boot", number = 1)
model_boot <- train(
  Activity ~ .,
  data = train_data,
  method = "rpart",
  trControl = control_par,
  metric = "Accuracy"
)
model_boot
## CART
##
## 2315 samples
## 3 predictor
## 2 classes: 'Idle', 'Run'
##
## No pre-processing
## Resampling: Bootstrapped (1 reps)
## Summary of sample sizes: 2315
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.0449827 0.7604994 0.5212733
## 0.1730104 0.7400681 0.4802199
## 0.2621107 0.6447219 0.2915295
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.0449827.
model_boot$finalModel
## n= 2315
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 2315 1156 Run (0.4993521 0.5006479)
## 2) X>=-2.041005 1887 792 Idle (0.5802862 0.4197138)
## 4) X< 1.579366 1399 448 Idle (0.6797713 0.3202287) *
## 5) X>=1.579366 488 144 Run (0.2950820 0.7049180) *
## 3) X< -2.041005 428 61 Run (0.1425234 0.8574766) *
Therefore we take the bootstrap model
# Predictions on the training data (predict() without newdata evaluates the
# model on the data it was trained on).
y_hats_train <- predict(object = model_boot)
# Training accuracy in percent.
accuracy <- mean(y_hats_train == train_data$Activity) * 100
accuracy
## [1] 71.79266
Confusion Matrix train data:
951 idle were correctly classified, 448 wrong
711 run were correctly classified, 205 wrong
# Confusion matrix on the training data.
tab1 <- table(
  Predicted = y_hats_train,
  Actual = train_data$Activity
)
tab1
## Actual
## Predicted Idle Run
## Idle 951 448
## Run 205 711
sum(diag(tab1))/sum(tab1)
## [1] 0.7179266
# Correctly classified training rows.
# FIX: the original compared data_clf$Activity (2893 rows) against
# y_hats_train (2315 predictions), silently recycling the shorter vector
# (the warnings in the knitted output). Compare against the training rows
# the predictions actually belong to.
classified_data <- train_data %>% filter(Activity == y_hats_train)
classified_data
plot(model_boot, main="Model Accuracies with CART")
Extract the variable importance using varImp() to understand which variables came out to be useful.
# Variable importance of the bootstrap CART model.
varimp_cart <- varImp(model_boot)
plot(varimp_cart, main="Variable Importance with CART")
# Draw the cross-validated decision tree and add split labels.
plot(model_cv$finalModel, branch = TRUE, margin = 0.1)
text(model_cv$finalModel)
## Generate predictions on the held-out test set
# FIX: the original called predict(object = model_boot) without newdata,
# which returns TRAINING-set predictions (2315 values); comparing those
# against the much smaller test set recycled the shorter vector (see the
# warnings in the knitted output) and produced a meaningless ~51.7 %
# "test accuracy". Pass newdata = test_data.
y_hats_test <- predict(object = model_boot, newdata = test_data)
## Print the accuracy
accuracy <- mean(y_hats_test == test_data$Activity) * 100
accuracy
## [1] 51.74946
# Correctly classified test rows.
# FIX: compare against the test rows the predictions belong to instead of the
# full data_clf; the original length mismatch caused silent recycling (the
# warnings in the knitted output).
classified_data <- test_data %>% filter(Activity == y_hats_test)
classified_data
# Reload the outlier-free data and redo the stratified 80/20 split.
read_data <- read.csv("Idle_Run_Tobias_Egger_no_outliers.csv")
motion_data_te_no_outliers <- data.frame(read_data)
set.seed(7)
train_index <- createDataPartition(motion_data_te_no_outliers$Activity,
                                   p = 0.80, list = FALSE)
test_data <- motion_data_te_no_outliers[-train_index, ]    # 20 % held out
train_data <- motion_data_te_no_outliers[train_index, ]    # 80 % for training
Cross validation achieves 72.28 % for the train data -> Only a little bit better
# Cross-validated CART on the outlier-free training data.
set.seed(10)
control_par <- trainControl(method = "cv", number = 10)
model_cv_iso <- train(
  Activity ~ .,
  data = train_data,
  method = "rpart",
  trControl = control_par,
  metric = "Accuracy"
)
model_cv_iso
## CART
##
## 2038 samples
## 3 predictor
## 2 classes: 'Idle', 'Run'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 1834, 1834, 1834, 1834, 1835, 1834, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.05677291 0.7228542 0.4479339
## 0.17629482 0.6726779 0.3495759
## 0.24402390 0.5819309 0.1629842
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.05677291.
Bootstrap achieves 75.03 % for the train data -> 1 % accuracy decrease
# Single-bootstrap CART on the outlier-free training data.
set.seed(34437) # previously 34056
control_par <- trainControl(method = "boot", number = 1)
model_boot_iso <- train(
  Activity ~ .,
  data = train_data,
  method = "rpart",
  trControl = control_par,
  metric = "Accuracy"
)
model_boot_iso
## CART
##
## 2038 samples
## 3 predictor
## 2 classes: 'Idle', 'Run'
##
## No pre-processing
## Resampling: Bootstrapped (1 reps)
## Summary of sample sizes: 2038
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.05677291 0.7503374 0.5057702
## 0.17629482 0.6396761 0.2970214
## 0.24402390 0.6396761 0.2970214
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.05677291.
# Classify the unlabeled recording with the bootstrap model and save the
# predictions.
read_data <- read.csv("Idle_Run_Tobias_Egger_no_labels.csv")
unlabeled_data <- data.frame(read_data)
head(unlabeled_data)
# Drop the columns the model was not trained on.
meta_cols <- c("ID", "Creator", "Sample", "timestamp")
unlabeled_data <- unlabeled_data[, !(names(unlabeled_data) %in% meta_cols)]
names(unlabeled_data)
# Predict the activity for every row and store it in the Activity column.
predictions <- predict(object = model_boot, newdata = unlabeled_data)
unlabeled_data$Activity <- predictions
unlabeled_data
write.csv(unlabeled_data, "Idle_Run_Tobias_Egger_Test_Data_Results.csv", row.names = FALSE)
Conclusion:
Outliers don’t always need to be removed
-> e.g. isotree: Threshold could be more optimized to get better results
-> Isotree detected 385 outliers for the train data and 107 outliers for the test data
Final classification model so far: bootstrap -> Achieves 76.04 % on train data and 51.74946 % on test data